In [143]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df_2015 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2015_YTD.csv')
df_2016 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2016_YTD.csv')
df_2017 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2017_YTD.csv')
df_2018 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2018_YTD.csv')
df_2019 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2019_YTD.csv')
df_2020 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2020_YTD.csv')
df_2021 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2021_YTD.csv')
df_2022 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2022_YTD.csv')
df_2023 = pd.read_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Data/JakartaSouth_PM2.5_2023_YTD.csv')
In [144]:
df_2015.info(),df_2016.info(),df_2017.info(),df_2018.info(),df_2019.info(),df_2020.info(),df_2021.info(),df_2022.info(),df_2023.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8760 non-null   object 
 1   Parameter      8760 non-null   object 
 2   Date (LT)      8760 non-null   object 
 3   Year           8760 non-null   int64  
 4   Month          8760 non-null   int64  
 5   Day            8760 non-null   int64  
 6   Hour           8760 non-null   int64  
 7   NowCast Conc.  8760 non-null   float64
 8   AQI            8760 non-null   int64  
 9   AQI Category   259 non-null    object 
 10  Raw Conc.      8760 non-null   float64
 11  Conc. Unit     8760 non-null   object 
 12  Duration       8760 non-null   object 
 13  QC Name        8760 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 958.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8784 entries, 0 to 8783
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8784 non-null   object 
 1   Parameter      8784 non-null   object 
 2   Date (LT)      8784 non-null   object 
 3   Year           8784 non-null   int64  
 4   Month          8784 non-null   int64  
 5   Day            8784 non-null   int64  
 6   Hour           8784 non-null   int64  
 7   NowCast Conc.  8784 non-null   float64
 8   AQI            8784 non-null   int64  
 9   AQI Category   8705 non-null   object 
 10  Raw Conc.      8784 non-null   float64
 11  Conc. Unit     8784 non-null   object 
 12  Duration       8784 non-null   object 
 13  QC Name        8784 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 960.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8760 entries, 0 to 8759
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8760 non-null   object 
 1   Parameter      8760 non-null   object 
 2   Date (LT)      8760 non-null   object 
 3   Year           8760 non-null   int64  
 4   Month          8760 non-null   int64  
 5   Day            8760 non-null   int64  
 6   Hour           8760 non-null   int64  
 7   NowCast Conc.  8760 non-null   float64
 8   AQI            8760 non-null   int64  
 9   AQI Category   8085 non-null   object 
 10  Raw Conc.      8760 non-null   float64
 11  Conc. Unit     8760 non-null   object 
 12  Duration       8760 non-null   object 
 13  QC Name        8760 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 958.2+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8370 entries, 0 to 8369
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8370 non-null   object 
 1   Parameter      8370 non-null   object 
 2   Date (LT)      8370 non-null   object 
 3   Year           8370 non-null   int64  
 4   Month          8370 non-null   int64  
 5   Day            8370 non-null   int64  
 6   Hour           8370 non-null   int64  
 7   NowCast Conc.  8370 non-null   float64
 8   AQI            8370 non-null   int64  
 9   AQI Category   7921 non-null   object 
 10  Raw Conc.      8370 non-null   float64
 11  Conc. Unit     8370 non-null   object 
 12  Duration       8370 non-null   object 
 13  QC Name        8370 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 915.6+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8289 entries, 0 to 8288
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8289 non-null   object 
 1   Parameter      8289 non-null   object 
 2   Date (LT)      8289 non-null   object 
 3   Year           8289 non-null   int64  
 4   Month          8289 non-null   int64  
 5   Day            8289 non-null   int64  
 6   Hour           8289 non-null   int64  
 7   NowCast Conc.  8289 non-null   float64
 8   AQI            8289 non-null   int64  
 9   AQI Category   8271 non-null   object 
 10  Raw Conc.      8289 non-null   float64
 11  Conc. Unit     8289 non-null   object 
 12  Duration       8289 non-null   object 
 13  QC Name        8289 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 906.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8598 entries, 0 to 8597
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8598 non-null   object 
 1   Parameter      8598 non-null   object 
 2   Date (LT)      8598 non-null   object 
 3   Year           8598 non-null   int64  
 4   Month          8598 non-null   int64  
 5   Day            8598 non-null   int64  
 6   Hour           8598 non-null   int64  
 7   NowCast Conc.  8598 non-null   float64
 8   AQI            8598 non-null   int64  
 9   AQI Category   8438 non-null   object 
 10  Raw Conc.      8598 non-null   float64
 11  Conc. Unit     8598 non-null   object 
 12  Duration       8598 non-null   object 
 13  QC Name        8598 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 940.5+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8542 entries, 0 to 8541
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8542 non-null   object 
 1   Parameter      8542 non-null   object 
 2   Date (LT)      8542 non-null   object 
 3   Year           8542 non-null   int64  
 4   Month          8542 non-null   int64  
 5   Day            8542 non-null   int64  
 6   Hour           8542 non-null   int64  
 7   NowCast Conc.  8542 non-null   float64
 8   AQI            8542 non-null   int64  
 9   AQI Category   8536 non-null   object 
 10  Raw Conc.      8542 non-null   float64
 11  Conc. Unit     8542 non-null   object 
 12  Duration       8542 non-null   object 
 13  QC Name        8542 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 934.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8268 entries, 0 to 8267
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           8268 non-null   object 
 1   Parameter      8268 non-null   object 
 2   Date (LT)      8268 non-null   object 
 3   Year           8268 non-null   int64  
 4   Month          8268 non-null   int64  
 5   Day            8268 non-null   int64  
 6   Hour           8268 non-null   int64  
 7   NowCast Conc.  8268 non-null   float64
 8   AQI            8268 non-null   int64  
 9   AQI Category   8234 non-null   object 
 10  Raw Conc.      8268 non-null   float64
 11  Conc. Unit     8268 non-null   object 
 12  Duration       8268 non-null   object 
 13  QC Name        8268 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 904.4+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3388 entries, 0 to 3387
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Site           3388 non-null   object 
 1   Parameter      3388 non-null   object 
 2   Date (LT)      3388 non-null   object 
 3   Year           3388 non-null   int64  
 4   Month          3388 non-null   int64  
 5   Day            3388 non-null   int64  
 6   Hour           3388 non-null   int64  
 7   NowCast Conc.  3388 non-null   float64
 8   AQI            3388 non-null   int64  
 9   AQI Category   3383 non-null   object 
 10  Raw Conc.      3388 non-null   float64
 11  Conc. Unit     3388 non-null   object 
 12  Duration       3388 non-null   object 
 13  QC Name        3388 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 370.7+ KB
Out[144]:
(None, None, None, None, None, None, None, None, None)
In [145]:
df_2015.head(),df_2016.head(),df_2017.head(),df_2018.head(),df_2019.head(),df_2020.head(),df_2021.head(),df_2022.head(),df_2023.head()
Out[145]:
(            Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2015-01-01 01:00 AM  2015      1    1   
 1  Jakarta South  PM2.5 - Principal  2015-01-01 02:00 AM  2015      1    1   
 2  Jakarta South  PM2.5 - Principal  2015-01-01 03:00 AM  2015      1    1   
 3  Jakarta South  PM2.5 - Principal  2015-01-01 04:00 AM  2015      1    1   
 4  Jakarta South  PM2.5 - Principal  2015-01-01 05:00 AM  2015      1    1   
 
    Hour  NowCast Conc.  AQI AQI Category  Raw Conc. Conc. Unit Duration  \
 0     1         -999.0 -999          NaN     -999.0      UG/M3     1 Hr   
 1     2         -999.0 -999          NaN     -999.0      UG/M3     1 Hr   
 2     3         -999.0 -999          NaN     -999.0      UG/M3     1 Hr   
 3     4         -999.0 -999          NaN     -999.0      UG/M3     1 Hr   
 4     5         -999.0 -999          NaN     -999.0      UG/M3     1 Hr   
 
    QC Name  
 0  Missing  
 1  Missing  
 2  Missing  
 3  Missing  
 4  Missing  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2016-01-01 01:00 AM  2016      1    1   
 1  Jakarta South  PM2.5 - Principal  2016-01-01 02:00 AM  2016      1    1   
 2  Jakarta South  PM2.5 - Principal  2016-01-01 03:00 AM  2016      1    1   
 3  Jakarta South  PM2.5 - Principal  2016-01-01 04:00 AM  2016      1    1   
 4  Jakarta South  PM2.5 - Principal  2016-01-01 05:00 AM  2016      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           80.2  164                       Unhealthy       85.0   
 1     2           82.6  165                       Unhealthy       85.0   
 2     3           49.3  135  Unhealthy for Sensitive Groups       16.0   
 3     4           33.1   95                        Moderate       17.0   
 4     5           33.0   95                        Moderate       33.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2017-01-01 01:00 AM  2017      1    1   
 1  Jakarta South  PM2.5 - Principal  2017-01-01 02:00 AM  2017      1    1   
 2  Jakarta South  PM2.5 - Principal  2017-01-01 03:00 AM  2017      1    1   
 3  Jakarta South  PM2.5 - Principal  2017-01-01 04:00 AM  2017      1    1   
 4  Jakarta South  PM2.5 - Principal  2017-01-01 05:00 AM  2017      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           54.4  148  Unhealthy for Sensitive Groups      115.0   
 1     2           58.1  152                       Unhealthy       60.0   
 2     3           47.2  130  Unhealthy for Sensitive Groups       39.0   
 3     4           43.8  121  Unhealthy for Sensitive Groups       41.0   
 4     5           40.8  114  Unhealthy for Sensitive Groups       38.0   
 
   Conc. Unit Duration  QC Name  
 0      UG/M3     1 Hr  Invalid  
 1      UG/M3     1 Hr    Valid  
 2      UG/M3     1 Hr    Valid  
 3      UG/M3     1 Hr    Valid  
 4      UG/M3     1 Hr    Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2018-01-01 01:00 AM  2018      1    1   
 1  Jakarta South  PM2.5 - Principal  2018-01-01 02:00 AM  2018      1    1   
 2  Jakarta South  PM2.5 - Principal  2018-01-01 03:00 AM  2018      1    1   
 3  Jakarta South  PM2.5 - Principal  2018-01-01 04:00 AM  2018      1    1   
 4  Jakarta South  PM2.5 - Principal  2018-01-01 05:00 AM  2018      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           76.5  162                       Unhealthy       97.0   
 1     2           79.2  163                       Unhealthy       82.0   
 2     3           58.1  152                       Unhealthy       37.0   
 3     4           38.5  108  Unhealthy for Sensitive Groups       19.0   
 4     5           20.7   69                        Moderate        3.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2019-01-01 01:00 AM  2019      1    1   
 1  Jakarta South  PM2.5 - Principal  2019-01-01 02:00 AM  2019      1    1   
 2  Jakarta South  PM2.5 - Principal  2019-01-01 03:00 AM  2019      1    1   
 3  Jakarta South  PM2.5 - Principal  2019-01-01 04:00 AM  2019      1    1   
 4  Jakarta South  PM2.5 - Principal  2019-01-01 05:00 AM  2019      1    1   
 
    Hour  NowCast Conc.  AQI AQI Category  Raw Conc. Conc. Unit Duration  \
 0     1           22.0   72     Moderate       29.0      UG/M3     1 Hr   
 1     2           11.5   48         Good        1.0      UG/M3     1 Hr   
 2     3           10.2   43         Good        9.0      UG/M3     1 Hr   
 3     4            7.6   32         Good        5.0      UG/M3     1 Hr   
 4     5            6.3   26         Good        5.0      UG/M3     1 Hr   
 
   QC Name  
 0   Valid  
 1   Valid  
 2   Valid  
 3   Valid  
 4   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2020-01-01 01:00 AM  2020      1    1   
 1  Jakarta South  PM2.5 - Principal  2020-01-01 02:00 AM  2020      1    1   
 2  Jakarta South  PM2.5 - Principal  2020-01-01 03:00 AM  2020      1    1   
 3  Jakarta South  PM2.5 - Principal  2020-01-01 04:00 AM  2020      1    1   
 4  Jakarta South  PM2.5 - Principal  2020-01-01 05:00 AM  2020      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           65.3  156                       Unhealthy       59.0   
 1     2           49.6  136  Unhealthy for Sensitive Groups       34.0   
 2     3           32.3   93                        Moderate       15.0   
 3     4           24.6   77                        Moderate       17.0   
 4     5           20.3   68                        Moderate       16.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2021-01-01 01:00 AM  2021      1    1   
 1  Jakarta South  PM2.5 - Principal  2021-01-01 02:00 AM  2021      1    1   
 2  Jakarta South  PM2.5 - Principal  2021-01-01 03:00 AM  2021      1    1   
 3  Jakarta South  PM2.5 - Principal  2021-01-01 04:00 AM  2021      1    1   
 4  Jakarta South  PM2.5 - Principal  2021-01-01 05:00 AM  2021      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           43.7  121  Unhealthy for Sensitive Groups       47.0   
 1     2           38.3  108  Unhealthy for Sensitive Groups       33.0   
 2     3           32.6   94                        Moderate       27.0   
 3     4           29.3   87                        Moderate       26.0   
 4     5           22.6   73                        Moderate       16.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2022-01-01 01:00 AM  2022      1    1   
 1  Jakarta South  PM2.5 - Principal  2022-01-01 02:00 AM  2022      1    1   
 2  Jakarta South  PM2.5 - Principal  2022-01-01 03:00 AM  2022      1    1   
 3  Jakarta South  PM2.5 - Principal  2022-01-01 04:00 AM  2022      1    1   
 4  Jakarta South  PM2.5 - Principal  2022-01-01 05:00 AM  2022      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           53.7  146  Unhealthy for Sensitive Groups       69.0   
 1     2           42.3  118  Unhealthy for Sensitive Groups       31.0   
 2     3           32.1   93                        Moderate       22.0   
 3     4           20.0   68                        Moderate        8.0   
 4     5           15.0   57                        Moderate       10.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  ,
             Site          Parameter            Date (LT)  Year  Month  Day  \
 0  Jakarta South  PM2.5 - Principal  2023-01-01 01:00 AM  2023      1    1   
 1  Jakarta South  PM2.5 - Principal  2023-01-01 02:00 AM  2023      1    1   
 2  Jakarta South  PM2.5 - Principal  2023-01-01 03:00 AM  2023      1    1   
 3  Jakarta South  PM2.5 - Principal  2023-01-01 04:00 AM  2023      1    1   
 4  Jakarta South  PM2.5 - Principal  2023-01-01 05:00 AM  2023      1    1   
 
    Hour  NowCast Conc.  AQI                    AQI Category  Raw Conc.  \
 0     1           44.7  124  Unhealthy for Sensitive Groups       50.0   
 1     2           29.3   87                        Moderate       14.0   
 2     3           20.1   68                        Moderate       11.0   
 3     4           16.5   60                        Moderate       13.0   
 4     5           14.2   55                        Moderate       12.0   
 
   Conc. Unit Duration QC Name  
 0      UG/M3     1 Hr   Valid  
 1      UG/M3     1 Hr   Valid  
 2      UG/M3     1 Hr   Valid  
 3      UG/M3     1 Hr   Valid  
 4      UG/M3     1 Hr   Valid  )
In [146]:
df_2015.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2016.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2017.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2018.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2019.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2020.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2021.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2022.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)
df_2023.rename(columns={'Date (LT)':'Date_LT','NowCast Conc.':'NowCast_Conc','AQI Category':'AQI_Category','Raw Conc.':'Raw_Conc','Conc. Unit':'Conc_Unit','QC Name':'QC_Name'}, inplace=True)

# Cleaning Data - Drop invalid data¶

In [147]:
#check data out-of-range
df_2015.loc[(df_2015['NowCast_Conc']<0)|(df_2015['AQI']<0)|(df_2015['Raw_Conc']<=0)].count()
Out[147]:
Site            8505
Parameter       8505
Date_LT         8505
Year            8505
Month           8505
Day             8505
Hour            8505
NowCast_Conc    8505
AQI             8505
AQI_Category       4
Raw_Conc        8505
Conc_Unit       8505
Duration        8505
QC_Name         8505
dtype: int64
In [148]:
#check data out-of-range
df_2016.loc[(df_2016['NowCast_Conc']<0)|(df_2016['AQI']<0)|(df_2016['Raw_Conc']<=0)].count()
Out[148]:
Site            127
Parameter       127
Date_LT         127
Year            127
Month           127
Day             127
Hour            127
NowCast_Conc    127
AQI             127
AQI_Category     48
Raw_Conc        127
Conc_Unit       127
Duration        127
QC_Name         127
dtype: int64
In [149]:
#check data out-of-range
df_2017.loc[(df_2017['NowCast_Conc']<0)|(df_2017['AQI']<0)|(df_2017['Raw_Conc']<=0)].count()
Out[149]:
Site            849
Parameter       849
Date_LT         849
Year            849
Month           849
Day             849
Hour            849
NowCast_Conc    849
AQI             849
AQI_Category    174
Raw_Conc        849
Conc_Unit       849
Duration        849
QC_Name         849
dtype: int64
In [150]:
#check data out-of-range
df_2018.loc[(df_2018['NowCast_Conc']<0)|(df_2018['AQI']<0)|(df_2018['Raw_Conc']<=0)].count()
Out[150]:
Site            535
Parameter       535
Date_LT         535
Year            535
Month           535
Day             535
Hour            535
NowCast_Conc    535
AQI             535
AQI_Category     86
Raw_Conc        535
Conc_Unit       535
Duration        535
QC_Name         535
dtype: int64
In [151]:
#check data out-of-range
df_2019.loc[(df_2019['NowCast_Conc']<0)|(df_2019['AQI']<0)|(df_2019['Raw_Conc']<=0)].count()
Out[151]:
Site            49
Parameter       49
Date_LT         49
Year            49
Month           49
Day             49
Hour            49
NowCast_Conc    49
AQI             49
AQI_Category    31
Raw_Conc        49
Conc_Unit       49
Duration        49
QC_Name         49
dtype: int64
In [152]:
#check data out-of-range
df_2020.loc[(df_2020['NowCast_Conc']<0)|(df_2020['AQI']<0)|(df_2020['Raw_Conc']<=0)].count()
Out[152]:
Site            210
Parameter       210
Date_LT         210
Year            210
Month           210
Day             210
Hour            210
NowCast_Conc    210
AQI             210
AQI_Category     50
Raw_Conc        210
Conc_Unit       210
Duration        210
QC_Name         210
dtype: int64
In [153]:
#check data out-of-range
df_2021.loc[(df_2021['NowCast_Conc']<0)|(df_2021['AQI']<0)|(df_2021['Raw_Conc']<=0)].count()
Out[153]:
Site            21
Parameter       21
Date_LT         21
Year            21
Month           21
Day             21
Hour            21
NowCast_Conc    21
AQI             21
AQI_Category    15
Raw_Conc        21
Conc_Unit       21
Duration        21
QC_Name         21
dtype: int64
In [154]:
#check data out-of-range
df_2022.loc[(df_2022['NowCast_Conc']<0)|(df_2022['AQI']<0)|(df_2022['Raw_Conc']<=0)].count()
Out[154]:
Site            50
Parameter       50
Date_LT         50
Year            50
Month           50
Day             50
Hour            50
NowCast_Conc    50
AQI             50
AQI_Category    16
Raw_Conc        50
Conc_Unit       50
Duration        50
QC_Name         50
dtype: int64
In [155]:
#check data out-of-range
df_2023.loc[(df_2023['NowCast_Conc']<0)|(df_2023['AQI']<0)|(df_2023['Raw_Conc']<=0)].count()
Out[155]:
Site            15
Parameter       15
Date_LT         15
Year            15
Month           15
Day             15
Hour            15
NowCast_Conc    15
AQI             15
AQI_Category    10
Raw_Conc        15
Conc_Unit       15
Duration        15
QC_Name         15
dtype: int64
In [156]:
#drop invalid values
# List of DataFrame names for each year
df_names = ['df_2015', 'df_2016', 'df_2017', 'df_2018', 'df_2019', 'df_2020', 'df_2021', 'df_2022', 'df_2023']

for df_name in df_names:
    df_year = globals()[df_name]  # Access the DataFrame by its name using globals()
    df_year = df_year.drop(df_year[(df_year['NowCast_Conc'] < 0) | (df_year['AQI'] < 0) | (df_year['Raw_Conc'] <= 0)].index)
    df_year = df_year.reset_index(drop=True)
    globals()[df_name] = df_year  # Update the DataFrame with the filtered data
In [157]:
# merging original dataset 
df_merged = pd.merge(df_2015,df_2016, how = 'outer').merge(df_2017, how = 'outer').merge(df_2018, how = 'outer').merge(df_2019, how = 'outer').merge(df_2020, how = 'outer').merge(df_2021, how = 'outer').merge(df_2022, how = 'outer').merge(df_2023, how = 'outer')
df_merged
Out[157]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
0 Jakarta South PM2.5 - Principal 2015-11-20 09:00 PM 2015 11 20 21 42.9 119 Unhealthy for Sensitive Groups 50.0 UG/M3 1 Hr Valid
1 Jakarta South PM2.5 - Principal 2015-11-20 10:00 PM 2015 11 20 22 32.3 93 Moderate 24.0 UG/M3 1 Hr Valid
2 Jakarta South PM2.5 - Principal 2015-11-20 11:00 PM 2015 11 20 23 29.5 88 Moderate 27.0 UG/M3 1 Hr Valid
3 Jakarta South PM2.5 - Principal 2015-11-21 12:00 AM 2015 11 21 0 30.8 90 Moderate 32.0 UG/M3 1 Hr Valid
4 Jakarta South PM2.5 - Principal 2015-11-21 01:00 AM 2015 11 21 1 31.4 92 Moderate 32.0 UG/M3 1 Hr Valid
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
61393 Jakarta South PM2.5 - Principal 2023-05-22 08:00 AM 2023 5 22 8 62.5 155 Unhealthy 64.0 UG/M3 1 Hr Valid
61394 Jakarta South PM2.5 - Principal 2023-05-22 09:00 AM 2023 5 22 9 55.9 151 Unhealthy 49.0 UG/M3 1 Hr Valid
61395 Jakarta South PM2.5 - Principal 2023-05-22 10:00 AM 2023 5 22 10 51.1 139 Unhealthy for Sensitive Groups 46.0 UG/M3 1 Hr Valid
61396 Jakarta South PM2.5 - Principal 2023-05-22 11:00 AM 2023 5 22 11 51.4 140 Unhealthy for Sensitive Groups 51.0 UG/M3 1 Hr Valid
61397 Jakarta South PM2.5 - Principal 2023-05-22 12:00 PM 2023 5 22 12 51.2 140 Unhealthy for Sensitive Groups 51.0 UG/M3 1 Hr Valid

61398 rows × 14 columns

#Select datapoint every 6 hour¶

In [158]:
# Dictionary to store the sampled data for each year
sampled_data = {}

# Iterate over each DataFrame
for df_name in df_names:
    df_year = globals()[df_name]  # Access the DataFrame by its name using globals()

    # Select data points at 2 AM, 8 AM, 2 PM, and 8 PM
    selected_hours = [2, 8, 14, 20]
    df_6hr = df_year[df_year['Hour'].isin(selected_hours)]

    sampled_data[df_name] = df_6hr  # Store the sampled data in the dictionary

# Access the sampled data for a specific year
df_2015_6hr = sampled_data['df_2015']
df_2016_6hr = sampled_data['df_2016']
df_2017_6hr = sampled_data['df_2017']
df_2018_6hr = sampled_data['df_2018']
df_2019_6hr = sampled_data['df_2019']
df_2020_6hr = sampled_data['df_2020']
df_2021_6hr = sampled_data['df_2021']
df_2022_6hr = sampled_data['df_2022']
df_2023_6hr = sampled_data['df_2023']

#Merge datatset at every 6 hour¶

In [159]:
df_6hour = pd.merge(df_2015_6hr,df_2016_6hr, how = 'outer').merge(df_2017_6hr, how = 'outer').merge(df_2018_6hr, how = 'outer').merge(df_2019_6hr, how = 'outer').merge(df_2020_6hr, how = 'outer').merge(df_2021_6hr, how = 'outer').merge(df_2022_6hr, how = 'outer').merge(df_2023_6hr, how = 'outer')
df_6hour
Out[159]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
0 Jakarta South PM2.5 - Principal 2015-11-21 02:00 AM 2015 11 21 2 31.7 92 Moderate 32.0 UG/M3 1 Hr Valid
1 Jakarta South PM2.5 - Principal 2015-11-21 08:00 AM 2015 11 21 8 54.2 147 Unhealthy for Sensitive Groups 58.0 UG/M3 1 Hr Valid
2 Jakarta South PM2.5 - Principal 2015-12-22 02:00 AM 2015 12 22 2 23.0 74 Moderate 23.0 UG/M3 1 Hr Valid
3 Jakarta South PM2.5 - Principal 2015-12-22 08:00 AM 2015 12 22 8 32.5 94 Moderate 34.0 UG/M3 1 Hr Valid
4 Jakarta South PM2.5 - Principal 2015-12-22 02:00 PM 2015 12 22 14 11.7 49 Good 10.0 UG/M3 1 Hr Valid
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10247 Jakarta South PM2.5 - Principal 2023-05-21 08:00 AM 2023 5 21 8 77.3 162 Unhealthy 72.0 UG/M3 1 Hr Valid
10248 Jakarta South PM2.5 - Principal 2023-05-21 02:00 PM 2023 5 21 14 48.1 132 Unhealthy for Sensitive Groups 46.0 UG/M3 1 Hr Valid
10249 Jakarta South PM2.5 - Principal 2023-05-21 08:00 PM 2023 5 21 20 39.8 112 Unhealthy for Sensitive Groups 36.0 UG/M3 1 Hr Valid
10250 Jakarta South PM2.5 - Principal 2023-05-22 02:00 AM 2023 5 22 2 39.2 110 Unhealthy for Sensitive Groups 43.0 UG/M3 1 Hr Valid
10251 Jakarta South PM2.5 - Principal 2023-05-22 08:00 AM 2023 5 22 8 62.5 155 Unhealthy 64.0 UG/M3 1 Hr Valid

10252 rows × 14 columns

In [160]:
df_6hour.head()
Out[160]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
0 Jakarta South PM2.5 - Principal 2015-11-21 02:00 AM 2015 11 21 2 31.7 92 Moderate 32.0 UG/M3 1 Hr Valid
1 Jakarta South PM2.5 - Principal 2015-11-21 08:00 AM 2015 11 21 8 54.2 147 Unhealthy for Sensitive Groups 58.0 UG/M3 1 Hr Valid
2 Jakarta South PM2.5 - Principal 2015-12-22 02:00 AM 2015 12 22 2 23.0 74 Moderate 23.0 UG/M3 1 Hr Valid
3 Jakarta South PM2.5 - Principal 2015-12-22 08:00 AM 2015 12 22 8 32.5 94 Moderate 34.0 UG/M3 1 Hr Valid
4 Jakarta South PM2.5 - Principal 2015-12-22 02:00 PM 2015 12 22 14 11.7 49 Good 10.0 UG/M3 1 Hr Valid
In [161]:
df_6hour.tail()
Out[161]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
10247 Jakarta South PM2.5 - Principal 2023-05-21 08:00 AM 2023 5 21 8 77.3 162 Unhealthy 72.0 UG/M3 1 Hr Valid
10248 Jakarta South PM2.5 - Principal 2023-05-21 02:00 PM 2023 5 21 14 48.1 132 Unhealthy for Sensitive Groups 46.0 UG/M3 1 Hr Valid
10249 Jakarta South PM2.5 - Principal 2023-05-21 08:00 PM 2023 5 21 20 39.8 112 Unhealthy for Sensitive Groups 36.0 UG/M3 1 Hr Valid
10250 Jakarta South PM2.5 - Principal 2023-05-22 02:00 AM 2023 5 22 2 39.2 110 Unhealthy for Sensitive Groups 43.0 UG/M3 1 Hr Valid
10251 Jakarta South PM2.5 - Principal 2023-05-22 08:00 AM 2023 5 22 8 62.5 155 Unhealthy 64.0 UG/M3 1 Hr Valid
In [162]:
df_6hour.info()
<class 'pandas.core.frame.DataFrame'>
Int64Index: 10252 entries, 0 to 10251
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Site          10252 non-null  object 
 1   Parameter     10252 non-null  object 
 2   Date_LT       10252 non-null  object 
 3   Year          10252 non-null  int64  
 4   Month         10252 non-null  int64  
 5   Day           10252 non-null  int64  
 6   Hour          10252 non-null  int64  
 7   NowCast_Conc  10252 non-null  float64
 8   AQI           10252 non-null  int64  
 9   AQI_Category  10252 non-null  object 
 10  Raw_Conc      10252 non-null  float64
 11  Conc_Unit     10252 non-null  object 
 12  Duration      10252 non-null  object 
 13  QC_Name       10252 non-null  object 
dtypes: float64(2), int64(5), object(7)
memory usage: 1.2+ MB
In [163]:
df_6hour.shape
Out[163]:
(10252, 14)
In [164]:
df_6hour.isnull().sum()
Out[164]:
Site            0
Parameter       0
Date_LT         0
Year            0
Month           0
Day             0
Hour            0
NowCast_Conc    0
AQI             0
AQI_Category    0
Raw_Conc        0
Conc_Unit       0
Duration        0
QC_Name         0
dtype: int64
In [165]:
from IPython.display import FileLink

# Save DataFrame to CSV file
df_6hour.to_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Clean_JakartaSouth_PM2.5_6HR_YTD.csv', index=False)
In [166]:
df_6hour.describe()
Out[166]:
Year Month Day Hour NowCast_Conc AQI Raw_Conc
count 10252.00000 10252.000000 10252.000000 10252.000000 10252.000000 10252.000000 10252.000000
mean 2019.20835 6.390753 15.725029 11.012876 40.687651 107.522532 41.218104
std 2.17036 3.464111 8.779137 6.715321 22.398155 39.903786 24.150764
min 2015.00000 1.000000 1.000000 2.000000 0.100000 0.000000 1.000000
25% 2017.00000 3.000000 8.000000 8.000000 23.800000 76.000000 23.000000
50% 2019.00000 6.000000 16.000000 8.000000 37.500000 106.000000 37.000000
75% 2021.00000 9.000000 23.000000 20.000000 53.600000 146.000000 55.000000
max 2023.00000 12.000000 31.000000 20.000000 182.800000 233.000000 187.000000
In [167]:
#Plot a graph to show the peak of Raw conc. 
# Group data by hour and calculate average raw concentration
hourly_avg = df_merged.groupby('Hour')['Raw_Conc'].mean()

# Create line graph
plt.plot(range(24), hourly_avg.values)
plt.xticks(range(24))
plt.xlabel('Hour of the day')
plt.ylabel('Average Raw_Conc')
plt.title('Average Raw concentration by Hour')
plt.grid(True)
plt.show()
In [168]:
# Group data by hour and calculate average raw concentration
hourly_avg = df_6hour.groupby('Hour')['Raw_Conc'].mean()

# Create line graph
plt.plot(hourly_avg.values)
plt.xticks(np.arange(4), ['2am', '8am', '14pm', '20pm'])
plt.xlabel('Every 6 Hour of the day')
plt.ylabel('Average Raw_Conc')
plt.title('Average Raw concentration by every 6 Hour')
plt.grid(True)
plt.show()

Q1. Boxplots to compare air pollution levels on a yearly basis¶

In [169]:
sns.boxplot(x=df_6hour.AQI)
plt.show()
In [170]:
# Calculate the IQR
Q1 = np.percentile(df_6hour['AQI'], 25)
Q3 = np.percentile(df_6hour['AQI'], 75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['AQI'] < lower_bound) | (df_6hour['AQI'] > upper_bound)]
num_outliers = len(outliers)


print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
# Print the number of outliers
print("Number of outliers for AQI:", num_outliers)
Q1 is 76.0
Q3 is 146.0
lower_bound is -29.0
upper_bound is 251.0
Number of outliers for AQI: 0
In [171]:
sns.boxplot(x=df_6hour.NowCast_Conc)
plt.show()
In [172]:
# Calculate the IQR
Q1 = np.percentile(df_6hour['NowCast_Conc'], 25)
Q3 = np.percentile(df_6hour['NowCast_Conc'], 75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['NowCast_Conc'] < lower_bound) | (df_6hour['NowCast_Conc'] > upper_bound)]
num_outliers = len(outliers)

# Print the number of outliers
print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
print("Number of outliers for NowCast_Conc:", num_outliers)
Q1 is 23.8
Q3 is 53.6
lower_bound is -20.900000000000002
upper_bound is 98.30000000000001
Number of outliers for NowCast_Conc: 179
In [173]:
sns.boxplot(x=df_6hour.Raw_Conc)
plt.show()
In [174]:
# Calculate the IQR
Q1 = np.percentile(df_6hour['Raw_Conc'], 25)
Q3 = np.percentile(df_6hour['Raw_Conc'], 75)
IQR = Q3 - Q1

# Define the upper and lower bounds for outliers
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
# Count the number of outliers
outliers = df_6hour[(df_6hour['Raw_Conc'] < lower_bound) | (df_6hour['Raw_Conc'] > upper_bound)]
num_outliers = len(outliers)

# Print the number of outliers
print("Q1 is", Q1)
print("Q3 is", Q3)
print("lower_bound is", lower_bound)
print("upper_bound is", upper_bound)
print("Number of outliers for Raw_Conc:", num_outliers)
Q1 is 23.0
Q3 is 55.0
lower_bound is -25.0
upper_bound is 103.0
Number of outliers for Raw_Conc: 187
In [175]:
# Create the grouped boxplot
plt.figure(figsize=(20, 6))
sns.boxplot(data=df_6hour, x='Month', y='AQI', hue='Year', palette='Set1')

# Set the plot title and axis labels
plt.title(' Boxplot of AQI by Month and Year')
plt.xlabel('Month')
plt.ylabel('AQI')

# Show the legend
plt.legend(title='Year', bbox_to_anchor=(0.5, -0.15), loc='upper center', ncol=3)

# Display the plot
plt.show()
In [176]:
# Plot boxplot of AQI values for each year
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_6hour, x='Year', y='AQI')

ax.set_xlabel('Year')
ax.set_ylabel('AQI')
ax.set_title('AQI Boxplot by Year')

plt.show()
In [177]:
# Plot boxplot of AQI values for each month
fig, ax = plt.subplots(figsize=(10, 6))
sns.boxplot(data=df_6hour, x='Month', y='AQI')

ax.set_xlabel('Month')
ax.set_ylabel('AQI')
ax.set_title('AQI Boxplot by Month')

plt.show()

Q2. Scatterplots to compare trends of air pollution levels on yearly basis¶

In [178]:
# Get the unique months and years
months = range(1, 13)  # 12 months
years = df_6hour['Year'].unique()  # Unique years from the 'Year' column in df_master

# Create empty lists for x_values and y_values
x_values = []
y_values = []

# Iterate over years and months
for year in years:
    for month in months:
        # Filter the DataFrame for the specific year and month
        filtered_df = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]
        # Get the AQI values for the filtered data
        aqi_values = filtered_df['AQI'].tolist()
        
        # Append the AQI values to y_values list
        y_values.extend(aqi_values)
        
        # Create a string representation of the month and year
        month_year = f"{month}-{year}"
        # Append the month-year string to x_values list for each AQI value in the filtered data
        x_values.extend([month_year] * len(aqi_values))
        
plt.figure(figsize=(16,8))
# Create the scatter plot using seaborn
sns.scatterplot(x=x_values, y=y_values, hue=x_values, palette='Set1', s=100)

# Set tAttributeErrorhe plot title and axis labels
plt.title('AQI Scatter Plot')
plt.xlabel('Time')
plt.ylabel('AQI')

# Rotate x-axis labels for better readability
plt.xticks(rotation=90,ha='right')
plt.legend().set_visible(False)

# Display the plot
plt.show()
In [179]:
# Group data by month and calculate average raw concentration
monthly_avg = df_6hour.groupby('Month')['Raw_Conc'].mean()

# Create line graph
plt.plot(range(12), monthly_avg.values)
plt.xticks(range(12))
plt.xlabel('Month')
plt.ylabel('Average Raw Concentration')
plt.title('Average Raw Concentration by Month')
plt.grid(True)
plt.show()
In [180]:
# Convert 'Date_LT' to datetime format
df_6hour['Date_LT'] = pd.to_datetime(df_6hour['Date_LT'])

plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['NowCast_Conc'])

# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('NowCast_Conc')
plt.title('Scatter Plot of NowCast_Conc')

# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)

# Show the plot
plt.show()
In [181]:
plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['AQI'])

# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('AQI')
plt.title('Scatter Plot of AQI')

# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)

# Show the plot
plt.show()
In [182]:
plt.figure(figsize=(20, 6))
plt.scatter(df_6hour.index, df_6hour['Raw_Conc'])

# Set labels and title for the plot
plt.xlabel('Time')
plt.ylabel('Raw_Conc')
plt.title('Scatter Plot of Raw_Conc')

# Customize x-axis tick locations and labels
plt.xticks(df_6hour.index[::500], df_6hour['Date_LT'].dt.strftime('%m-%Y')[::500], rotation=45)

# Show the plot
plt.show()
In [ ]:
 
In [183]:
# Group data by month and calculate average AQI
monthly_aqi = df_6hour.groupby('Month')['AQI'].mean()

# Create line graph
plt.plot(range(12), monthly_aqi.values)
plt.xticks(range(12))
plt.xlabel('Month')
plt.ylabel('Average AQI')
plt.title('Average AQI by Month')

# Show the plot
plt.show()

Q3. Histograms to compare the distribution of air pollution levels on yearly basis¶

In [184]:
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['NowCast_Conc'],color='Red')
p.axes.set_title("NowCast_Conc Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/2453233521.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p=sns.distplot(df_6hour['NowCast_Conc'],color='Red')
Out[184]:
Text(0.5, 1.0, 'NowCast_Conc Distribution')
In [185]:
df_6hour['NowCast_Conc'].median()
Out[185]:
37.5
In [186]:
df_6hour['NowCast_Conc'].value_counts()
Out[186]:
24.6     31
38.6     30
25.7     30
35.3     30
27.9     29
         ..
131.4     1
111.0     1
111.9     1
131.8     1
2.4       1
Name: NowCast_Conc, Length: 1083, dtype: int64
In [187]:
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['AQI'],color='Red')
p.axes.set_title("AQI Values Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/2869661363.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p=sns.distplot(df_6hour['AQI'],color='Red')
Out[187]:
Text(0.5, 1.0, 'AQI Values Distribution')
In [188]:
df_6hour['AQI'].median()
Out[188]:
106.0
In [189]:
df_6hour['AQI'].value_counts().head(15)
Out[189]:
152    210
154    176
155    174
153    171
156    162
158    138
157    130
151    122
75     113
77     111
80     108
85     104
162    103
87     101
160    100
Name: AQI, dtype: int64
In [190]:
sns.set(rc={'figure.figsize':(10,5)})
plt.xticks(fontsize=12)
p=sns.distplot(df_6hour['Raw_Conc'],color='Red')
p.axes.set_title("Raw_Conc Values Distribution",fontsize=20)
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/269364611.py:3: UserWarning: 

`distplot` is a deprecated function and will be removed in seaborn v0.14.0.

Please adapt your code to use either `displot` (a figure-level function with
similar flexibility) or `histplot` (an axes-level function for histograms).

For a guide to updating your code to use the new functions, please see
https://gist.github.com/mwaskom/de44147ed2974457ad6372750bbe5751

  p=sns.distplot(df_6hour['Raw_Conc'],color='Red')
Out[190]:
Text(0.5, 1.0, 'Raw_Conc Values Distribution')
In [191]:
df_6hour['Raw_Conc'].median()
Out[191]:
37.0
In [192]:
df_6hour['Raw_Conc'].value_counts()
Out[192]:
27.0     223
17.0     207
38.0     203
24.0     200
48.0     191
        ... 
146.0      1
160.0      1
158.0      1
175.0      1
128.0      1
Name: Raw_Conc, Length: 151, dtype: int64
In [193]:
# Histogram of AQI each month over years
# Get the unique months and years
months = range(1, 13)  # 12 months
years = df_6hour['Year'].unique()  # Unique years from the 'Year' column in df_6hour

# Set the number of subplots
num_subplots = len(years) * len(months)

# Set the number of columns and rows for subplots
num_cols = 4  # Number of columns of subplots
num_rows = -(-num_subplots // num_cols)  # Round up division

# Set the height of each subplot
subplot_height = 5

# Calculate the figure height based on the number of rows and the desired subplot height
fig_height = num_rows * subplot_height

# Create a new figure and set the size
fig, axes = plt.subplots(num_rows, num_cols, figsize=(16, fig_height))

# Flatten the axes array
axes = axes.flatten()

# Iterate over years and months
for i, (year, month) in enumerate([(year, month) for year in years for month in months]):
    # Exclude January to October 2015
    if year == 2015 and month < 11:
        continue

    # Filter the DataFrame for the specific year and month
    filtered_df = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]

    # Check if there is no data for the month
    if filtered_df.empty:
        continue

    # Get the AQI values for the filtered data
    aqi_values = filtered_df['AQI'].tolist()

    # Create a string representation of the month and year
    month_year = f"{month}-{year}"

    # Create a histogram of AQI values for each month by year
    sns.histplot(aqi_values, kde=True, ax=axes[i])

    # Set the plot title and axis labels
    axes[i].set_title(f"AQI Histogram - {month_year}")
    axes[i].set_xlabel("AQI")
    axes[i].set_ylabel("Frequency")

    # Remove x-axis label for all but the last row
    if i // num_cols != num_rows - 1:
        axes[i].set_xlabel("")
In [194]:
sns.histplot(df_2016_6hr['AQI'], kde=True)
Out[194]:
<Axes: xlabel='AQI', ylabel='Count'>
In [195]:
sns.histplot(df_2017_6hr['AQI'], kde=True)
Out[195]:
<Axes: xlabel='AQI', ylabel='Count'>
In [196]:
sns.histplot(df_2018_6hr['AQI'], kde=True)
Out[196]:
<Axes: xlabel='AQI', ylabel='Count'>
In [197]:
sns.histplot(df_2019_6hr['AQI'], kde=True)
Out[197]:
<Axes: xlabel='AQI', ylabel='Count'>
In [198]:
df_6hour
Out[198]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
0 Jakarta South PM2.5 - Principal 2015-11-21 02:00:00 2015 11 21 2 31.7 92 Moderate 32.0 UG/M3 1 Hr Valid
1 Jakarta South PM2.5 - Principal 2015-11-21 08:00:00 2015 11 21 8 54.2 147 Unhealthy for Sensitive Groups 58.0 UG/M3 1 Hr Valid
2 Jakarta South PM2.5 - Principal 2015-12-22 02:00:00 2015 12 22 2 23.0 74 Moderate 23.0 UG/M3 1 Hr Valid
3 Jakarta South PM2.5 - Principal 2015-12-22 08:00:00 2015 12 22 8 32.5 94 Moderate 34.0 UG/M3 1 Hr Valid
4 Jakarta South PM2.5 - Principal 2015-12-22 14:00:00 2015 12 22 14 11.7 49 Good 10.0 UG/M3 1 Hr Valid
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10247 Jakarta South PM2.5 - Principal 2023-05-21 08:00:00 2023 5 21 8 77.3 162 Unhealthy 72.0 UG/M3 1 Hr Valid
10248 Jakarta South PM2.5 - Principal 2023-05-21 14:00:00 2023 5 21 14 48.1 132 Unhealthy for Sensitive Groups 46.0 UG/M3 1 Hr Valid
10249 Jakarta South PM2.5 - Principal 2023-05-21 20:00:00 2023 5 21 20 39.8 112 Unhealthy for Sensitive Groups 36.0 UG/M3 1 Hr Valid
10250 Jakarta South PM2.5 - Principal 2023-05-22 02:00:00 2023 5 22 2 39.2 110 Unhealthy for Sensitive Groups 43.0 UG/M3 1 Hr Valid
10251 Jakarta South PM2.5 - Principal 2023-05-22 08:00:00 2023 5 22 8 62.5 155 Unhealthy 64.0 UG/M3 1 Hr Valid

10252 rows × 14 columns

In [199]:
plt.figure(figsize=(15, 6), dpi=100)
plt.grid()

# Extract the month from the 'Date_LT' column
df_6hour['Month'] = pd.to_datetime(df_6hour['Date_LT']).dt.month

# Plot the line graph
sns.lineplot(data=df_6hour, x='Month', y='AQI', hue='Year', ci=0, palette='Set1')

# Set the x-axis tick labels
plt.xticks(range(1, 13), ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'])

# Set the title
plt.title('AQI Monthly Trend (by Year)')

# Show the plot
plt.show()
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/4083817283.py:8: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=('ci', 0)` for the same effect.

  sns.lineplot(data=df_6hour, x='Month', y='AQI', hue='Year', ci=0, palette='Set1')
In [200]:
# Group the data by year and calculate the mean AQI
average_aqi_by_year = df_6hour.groupby(df_6hour['Year'])['AQI'].mean()

# Plot the graph
plt.plot(average_aqi_by_year.index, average_aqi_by_year.values, marker='o')

# Set the x-axis label
plt.xlabel('Year')

# Set the y-axis label
plt.ylabel('Average AQI')

# Set the title
plt.title('Trend of AQI over Years')

# Show the plot
plt.show()
In [201]:
import statsmodels.formula.api as smf            
import statsmodels.tsa.api as smt
import statsmodels.api as sm
from statsmodels.tsa.seasonal import seasonal_decompose

jakarta_decompose = seasonal_decompose(df_6hour['AQI'], period=12)
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose.trend)
Out[201]:
<Axes: xlabel='Date_LT', ylabel='trend'>
In [202]:
jakarta_decompose2 = seasonal_decompose(df_6hour['NowCast_Conc'], period=12)

sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose2.trend)
Out[202]:
<Axes: xlabel='Date_LT', ylabel='trend'>
In [203]:
jakarta_decompose3 = seasonal_decompose(df_6hour['Raw_Conc'], period=12)

sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose3.trend)
Out[203]:
<Axes: xlabel='Date_LT', ylabel='trend'>
In [204]:
# Group the data by 'AQI_Category' and 'Year' and calculate the count
grouped_data = df_6hour.groupby(['AQI_Category', 'Year']).size().unstack()

# Plot the grouped bar chart
grouped_data.plot(kind='bar', rot=0)

# Set the y-axis label and chart title
plt.ylabel('Count')
plt.title('Grouped Bar Chart of AQI_Category by Year')
plt.xticks(rotation=45)
# Show the plot
plt.show()
In [205]:
# Extract year, month, and day from the 'Day' column
df_6hour['Year'] = df_6hour['Date_LT'].dt.year
df_6hour['Month'] = df_6hour['Date_LT'].dt.month
df_6hour['Day'] = df_6hour['Date_LT'].dt.day

# Create the pivot table to calculate the average AQI
pivot_table_6hour = pd.pivot_table(df_6hour, values='AQI', index=['Year', 'Month', 'Day'], aggfunc='mean').reset_index()

# Rename the column to 'Average AQI'
pivot_table_6hour.rename(columns={'AQI': 'Average AQI'}, inplace=True)

# Print the resulting dataset
print(pivot_table_6hour)
      Year  Month  Day  Average AQI
0     2015     11   21       119.50
1     2015     12   22        69.75
2     2015     12   23        81.75
3     2015     12   24        80.00
4     2015     12   25        98.75
...    ...    ...  ...          ...
2609  2023      5   18       132.25
2610  2023      5   19       146.75
2611  2023      5   20       126.00
2612  2023      5   21       141.00
2613  2023      5   22       132.50

[2614 rows x 4 columns]
In [206]:
# Group the data by year and classify days as unhealthy or healthy
unhealthy_threshold = 100  # Adjust this threshold as needed
pivot_table_6hour['Day Classification'] = pivot_table_6hour['Average AQI'].apply(lambda x: 'Unhealthy' if x > unhealthy_threshold else 'Healthy')
day_classification = pivot_table_6hour.groupby(['Year', 'Day Classification']).size().unstack().fillna(0)

# Plot the graph
ax = day_classification.plot(kind='bar', width=0.8)

# Set the x-axis label
ax.set_xlabel('Year')

# Set the y-axis label
ax.set_ylabel('Number of Days')

# Set the title
ax.set_title('Number of Unhealthy and Healthy Days over Years')

# Set the x-tick labels
ax.set_xticklabels(day_classification.index, rotation=45)

# Create a legend
ax.legend(['Healthy', 'Unhealthy'])

# Display the data values for each bar
for container in ax.containers:
    ax.bar_label(container, fmt='%d', label_type='edge', fontsize=8)

# Show the plot
plt.show()

#Season analysis¶

In [207]:
season_jakarta_dict = {1: 'Wet', 2: 'Wet', 3: 'Wet',4: 'Wet',5: 'Wet',6: 'Dry',
                       7: 'Dry', 8: 'Dry', 9: 'Dry',10: 'Dry',11: 'Wet',12: 'Wet'}
df_6hour['Season'] = df_6hour['Month'].map(season_jakarta_dict)
df_6hour
Out[207]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name Season
0 Jakarta South PM2.5 - Principal 2015-11-21 02:00:00 2015 11 21 2 31.7 92 Moderate 32.0 UG/M3 1 Hr Valid Wet
1 Jakarta South PM2.5 - Principal 2015-11-21 08:00:00 2015 11 21 8 54.2 147 Unhealthy for Sensitive Groups 58.0 UG/M3 1 Hr Valid Wet
2 Jakarta South PM2.5 - Principal 2015-12-22 02:00:00 2015 12 22 2 23.0 74 Moderate 23.0 UG/M3 1 Hr Valid Wet
3 Jakarta South PM2.5 - Principal 2015-12-22 08:00:00 2015 12 22 8 32.5 94 Moderate 34.0 UG/M3 1 Hr Valid Wet
4 Jakarta South PM2.5 - Principal 2015-12-22 14:00:00 2015 12 22 14 11.7 49 Good 10.0 UG/M3 1 Hr Valid Wet
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
10247 Jakarta South PM2.5 - Principal 2023-05-21 08:00:00 2023 5 21 8 77.3 162 Unhealthy 72.0 UG/M3 1 Hr Valid Wet
10248 Jakarta South PM2.5 - Principal 2023-05-21 14:00:00 2023 5 21 14 48.1 132 Unhealthy for Sensitive Groups 46.0 UG/M3 1 Hr Valid Wet
10249 Jakarta South PM2.5 - Principal 2023-05-21 20:00:00 2023 5 21 20 39.8 112 Unhealthy for Sensitive Groups 36.0 UG/M3 1 Hr Valid Wet
10250 Jakarta South PM2.5 - Principal 2023-05-22 02:00:00 2023 5 22 2 39.2 110 Unhealthy for Sensitive Groups 43.0 UG/M3 1 Hr Valid Wet
10251 Jakarta South PM2.5 - Principal 2023-05-22 08:00:00 2023 5 22 8 62.5 155 Unhealthy 64.0 UG/M3 1 Hr Valid Wet

10252 rows × 15 columns

In [208]:
plt.figure(figsize=(15,6), dpi=100)

sns.lineplot(data=df_6hour, x='Date_LT', y='AQI',hue=df_6hour['Season'],palette='Set2');
In [209]:
plt.figure(figsize=(15, 6), dpi=100)

# Perform seasonal decomposition
jakarta_decompose = seasonal_decompose(df_6hour['AQI'], period=12)

# Plot the trend component
sns.lineplot(x=df_6hour['Date_LT'], y=jakarta_decompose.trend, label='Trend')

# Split the plot by season using different colors
seasons = df_6hour['Season'].unique()
colors = sns.color_palette('Set2', len(seasons))

for i, season in enumerate(seasons):
    season_data = df_6hour[df_6hour['Season'] == season]
    sns.lineplot(x=season_data['Date_LT'], y=jakarta_decompose.trend.loc[season_data.index], color=colors[i], label=season)

# Set the plot title and axis labels
plt.title('AQI Trend by Season')
plt.xlabel('Date')
plt.ylabel('AQI')

# Display the legend
plt.legend()

# Show the plot
plt.show()
In [210]:
season_year_mean_aqi = df_6hour.groupby(['Season', 'Year'])['AQI'].mean().unstack()

plt.figure(figsize=(10, 6))
sns.heatmap(season_year_mean_aqi, cmap='YlOrRd', annot=True, fmt='.2f', cbar=True)
plt.xlabel('Year')
plt.ylabel('Season')
plt.title('Mean AQI for Each Season Over Years in Jakarta South')
plt.show()

Weekly AQI analysis¶

In [211]:
# Extract day of the week
df_6hour['DayOfWeek'] = df_6hour['Date_LT'].dt.dayofweek

# Group data by day of the week and calculate average AQI
avg_rawconc_by_day = df_6hour.groupby('DayOfWeek')['Raw_Conc'].mean().reset_index()

# Define the order of days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Plot AQI values by days of the week
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(days_of_week, avg_rawconc_by_day['Raw_Conc'], marker='o')

ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average Raw_Conc')
ax.set_title('Raw_Conc Value by Days of the Week')

plt.show()
In [212]:
# Extract day of the week
df_6hour['DayOfWeek'] = df_6hour['Date_LT'].dt.dayofweek

# Group data by day of the week and calculate average AQI
avg_aqi_by_day = df_6hour.groupby('DayOfWeek')['AQI'].mean().reset_index()

# Define the order of days of the week
days_of_week = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Plot AQI values by days of the week
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(days_of_week, avg_aqi_by_day['AQI'], marker='o')

ax.set_xlabel('Day of the Week')
ax.set_ylabel('Average AQI')
ax.set_title('AQI Value by Days of the Week')

plt.show()

Q4 Systematic sampling¶

In [213]:
df_sampled = pd.DataFrame()
k = 4  # Number of records to sample per month per year

for year in df_6hour['Year'].unique():
    for month in df_6hour['Month'].unique():
    
        df_month = df_6hour[(df_6hour['Year'] == year) & (df_6hour['Month'] == month)]
        
        if len(df_month) < k:
            continue
        
        step = len(df_month) // k  # Calculate the step size
        
        # Select every kth record using systematic sampling
        sampled = df_month.iloc[::step][:k]
        
        df_sampled = pd.concat([df_sampled, sampled])

df_sampled.reset_index(drop=True, inplace=True)
df_sampled.drop(columns=['DayOfWeek', 'Season'], inplace=True)
df_sampled
Out[213]:
Site Parameter Date_LT Year Month Day Hour NowCast_Conc AQI AQI_Category Raw_Conc Conc_Unit Duration QC_Name
0 Jakarta South PM2.5 - Principal 2015-12-22 02:00:00 2015 12 22 2 23.0 74 Moderate 23.0 UG/M3 1 Hr Valid
1 Jakarta South PM2.5 - Principal 2015-12-24 14:00:00 2015 12 24 14 11.0 46 Good 8.0 UG/M3 1 Hr Valid
2 Jakarta South PM2.5 - Principal 2015-12-27 02:00:00 2015 12 27 2 36.2 103 Unhealthy for Sensitive Groups 50.0 UG/M3 1 Hr Valid
3 Jakarta South PM2.5 - Principal 2015-12-29 14:00:00 2015 12 29 14 21.5 71 Moderate 16.0 UG/M3 1 Hr Valid
4 Jakarta South PM2.5 - Principal 2016-11-01 02:00:00 2016 11 1 2 19.4 66 Moderate 13.0 UG/M3 1 Hr Valid
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
355 Jakarta South PM2.5 - Principal 2023-04-23 02:00:00 2023 4 23 2 36.4 103 Unhealthy for Sensitive Groups 43.0 UG/M3 1 Hr Valid
356 Jakarta South PM2.5 - Principal 2023-05-01 02:00:00 2023 5 1 2 23.8 76 Moderate 26.0 UG/M3 1 Hr Valid
357 Jakarta South PM2.5 - Principal 2023-05-06 08:00:00 2023 5 6 8 28.5 85 Moderate 33.0 UG/M3 1 Hr Valid
358 Jakarta South PM2.5 - Principal 2023-05-11 14:00:00 2023 5 11 14 15.0 57 Moderate 10.0 UG/M3 1 Hr Valid
359 Jakarta South PM2.5 - Principal 2023-05-16 20:00:00 2023 5 16 20 26.0 80 Moderate 25.0 UG/M3 1 Hr Valid

360 rows × 14 columns

In [214]:
# Save DataFrame to CSV file
df_sampled.to_csv(r'/Users/minhnguyetnguyen/Documents/Data_Analytics/DANA/Assignment_2/Sampled_JakartaSouth_PM2.5_6HR_YTD.csv', index=False)
In [215]:
df_sampled['Year'].unique()
Out[215]:
array([2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023])
In [216]:
df_sampled['Hour'].unique()
Out[216]:
array([ 2, 14, 20,  8])
In [217]:
df_sampled['Month'].unique()
Out[217]:
array([12, 11,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10])
In [218]:
df_sampled.describe()
Out[218]:
Year Month Day Hour NowCast_Conc AQI Raw_Conc
count 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000 360.000000
mean 2019.177778 6.366667 12.122222 9.433333 42.200556 109.786111 42.569444
std 2.186705 3.502327 8.422450 6.846588 24.007838 40.773050 26.075435
min 2015.000000 1.000000 1.000000 2.000000 3.200000 13.000000 1.000000
25% 2017.000000 3.000000 5.000000 2.000000 23.750000 75.750000 23.000000
50% 2019.000000 6.000000 11.500000 8.000000 39.000000 110.000000 39.500000
75% 2021.000000 9.000000 20.000000 14.000000 56.000000 151.000000 57.000000
max 2023.000000 12.000000 29.000000 20.000000 127.200000 188.000000 146.000000

Q5. AQI Category comparison among all years.¶

In [219]:
plt.figure(figsize=(12, 6))
sns.countplot(data=df_sampled, x='AQI_Category', hue='Year', palette='Set1')
plt.xlabel('AQI Category')
plt.ylabel('Count')
plt.title('AQI Category Comparison Among All Years')
plt.legend(title='Year')
plt.show()
In [220]:
plt.figure(figsize=(12, 6))
sns.countplot(data=df_sampled, x='Month', hue='AQI_Category', palette='Set1')
plt.xlabel('Month')
plt.ylabel('Count')
plt.title('AQI Category Comparison of Each Month Over Years')
plt.legend(title='AQI Category')
plt.show()

Q6. Air quality (AQI) correlation among all years.¶

In [221]:
# Create a pivot table to reshape the data
df_pivot = df_sampled.pivot_table(index=['Month', 'Day', 'Hour'], columns='Year', values='AQI', aggfunc=lambda x: x)

# Rename the columns to include a prefix or suffix indicating the year
df_pivot.columns = ['AQI_' + str(year) for year in df_pivot.columns]

# Reset the index and fill NaN values with blank
df_new = df_pivot.reset_index()

# Print the new DataFrame
df_new
Out[221]:
Month Day Hour AQI_2015 AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
0 1 1 2 NaN 165.0 152.0 163.0 48.0 136.0 108.0 118.0 87.0
1 1 5 8 NaN NaN NaN 56.0 NaN NaN NaN NaN NaN
2 1 7 8 NaN NaN NaN NaN 143.0 NaN NaN NaN NaN
3 1 8 20 NaN 63.0 92.0 NaN NaN 162.0 81.0 NaN 65.0
4 1 9 8 NaN NaN NaN NaN NaN NaN NaN 101.0 NaN
... ... ... ... ... ... ... ... ... ... ... ... ...
170 12 24 8 NaN NaN NaN 43.0 NaN 76.0 80.0 149.0 NaN
171 12 24 14 46.0 NaN NaN NaN NaN NaN NaN NaN NaN
172 12 26 8 NaN NaN 71.0 NaN NaN NaN NaN NaN NaN
173 12 27 2 103.0 NaN NaN NaN NaN NaN NaN NaN NaN
174 12 29 14 71.0 NaN NaN NaN NaN NaN NaN NaN NaN

175 rows × 12 columns

In [222]:
df_new.describe()
Out[222]:
Month Day Hour AQI_2015 AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
count 175.000000 175.000000 175.000000 4.000000 48.000000 48.000000 48.000000 48.000000 48.000000 48.000000 48.000000 20.000000
mean 6.451429 14.274286 10.640000 73.500000 112.958333 92.062500 122.979167 131.708333 123.166667 102.812500 99.229167 77.650000
std 3.517883 7.537963 6.671401 23.330952 39.009523 41.764999 42.396050 36.030105 37.972181 40.998005 32.390726 23.978664
min 1.000000 1.000000 2.000000 46.000000 52.000000 18.000000 43.000000 36.000000 35.000000 13.000000 29.000000 51.000000
25% 3.500000 8.000000 2.000000 64.750000 79.000000 60.750000 87.500000 109.500000 88.500000 74.000000 74.000000 63.500000
50% 6.000000 15.000000 8.000000 72.500000 110.000000 85.500000 135.500000 147.000000 131.500000 104.000000 98.000000 72.000000
75% 10.000000 22.000000 14.000000 81.250000 152.000000 120.750000 160.500000 155.250000 151.250000 134.500000 123.000000 85.500000
max 12.000000 29.000000 20.000000 103.000000 188.000000 187.000000 182.000000 183.000000 181.000000 187.000000 160.000000 158.000000
In [223]:
df_2015_new = df_new[["Month","AQI_2015"]].dropna().reset_index(drop=True)
df_2015_new
Out[223]:
Month AQI_2015
0 12 74.0
1 12 46.0
2 12 103.0
3 12 71.0
In [224]:
# Filter other AQI columns and create new dataframes
df_2016_new = df_new[["Month", "AQI_2016"]].dropna().reset_index(drop=True)
df_2017_new = df_new[["Month", "AQI_2017"]].dropna().reset_index(drop=True)
df_2018_new = df_new[["Month", "AQI_2018"]].dropna().reset_index(drop=True)
df_2019_new = df_new[["Month", "AQI_2019"]].dropna().reset_index(drop=True)
df_2020_new = df_new[["Month", "AQI_2020"]].dropna().reset_index(drop=True)
df_2021_new = df_new[["Month", "AQI_2021"]].dropna().reset_index(drop=True)
df_2022_new = df_new[["Month", "AQI_2022"]].dropna().reset_index(drop=True)
df_2023_new = df_new[["Month", "AQI_2023"]].dropna().reset_index(drop=True)
In [225]:
# Reset index of df_2015_new starting from 44
df_2015_new = df_2015_new.reset_index(drop=True)
df_2015_new.index = df_2015_new.index + 44

df_2015_new
Out[225]:
Month AQI_2015
44 12 74.0
45 12 46.0
46 12 103.0
47 12 71.0
In [226]:
df_2023_new
Out[226]:
Month AQI_2023
0 1 87.0
1 1 65.0
2 1 71.0
3 1 51.0
4 2 80.0
5 2 68.0
6 2 54.0
7 2 59.0
8 3 72.0
9 3 100.0
10 3 72.0
11 3 158.0
12 4 69.0
13 4 91.0
14 4 55.0
15 4 103.0
16 5 76.0
17 5 85.0
18 5 57.0
19 5 80.0
In [227]:
# Concatenate all 9 datasets by index
df_concatenated = pd.concat([df_2015_new['AQI_2015'], df_2016_new, df_2017_new["AQI_2017"], df_2018_new["AQI_2018"], df_2019_new["AQI_2019"], df_2020_new["AQI_2020"], df_2021_new["AQI_2021"], df_2022_new["AQI_2022"], df_2023_new["AQI_2023"]], axis=1)

# Display the concatenated dataframe
df_concatenated
Out[227]:
AQI_2015 Month AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
44 74.0 12 140.0 43.0 153.0 145.0 53.0 92.0 126.0 NaN
45 46.0 12 84.0 40.0 116.0 161.0 59.0 71.0 81.0 NaN
46 103.0 12 66.0 23.0 58.0 154.0 35.0 110.0 85.0 NaN
47 71.0 12 52.0 71.0 43.0 148.0 76.0 80.0 149.0 NaN
0 NaN 1 165.0 152.0 163.0 48.0 136.0 108.0 118.0 87.0
1 NaN 1 63.0 92.0 56.0 143.0 162.0 81.0 101.0 65.0
2 NaN 1 162.0 34.0 49.0 99.0 111.0 47.0 77.0 71.0
3 NaN 1 79.0 86.0 68.0 40.0 145.0 75.0 71.0 51.0
4 NaN 2 74.0 71.0 136.0 140.0 117.0 174.0 74.0 80.0
5 NaN 2 90.0 51.0 111.0 107.0 57.0 43.0 51.0 68.0
6 NaN 2 79.0 42.0 64.0 108.0 126.0 58.0 29.0 54.0
7 NaN 2 85.0 99.0 149.0 77.0 84.0 142.0 77.0 59.0
8 NaN 3 64.0 107.0 110.0 151.0 85.0 132.0 63.0 72.0
9 NaN 3 73.0 68.0 65.0 120.0 125.0 63.0 64.0 100.0
10 NaN 3 152.0 83.0 159.0 88.0 151.0 118.0 108.0 72.0
11 NaN 3 84.0 114.0 167.0 36.0 152.0 60.0 89.0 158.0
12 NaN 4 188.0 28.0 70.0 140.0 146.0 17.0 122.0 69.0
13 NaN 4 85.0 56.0 70.0 91.0 66.0 70.0 67.0 91.0
14 NaN 4 164.0 81.0 141.0 61.0 87.0 129.0 111.0 55.0
15 NaN 4 77.0 62.0 80.0 154.0 99.0 149.0 80.0 103.0
16 NaN 5 115.0 153.0 166.0 133.0 146.0 153.0 160.0 76.0
17 NaN 5 160.0 98.0 110.0 151.0 149.0 139.0 91.0 85.0
18 NaN 5 120.0 130.0 153.0 146.0 162.0 105.0 94.0 57.0
19 NaN 5 151.0 156.0 63.0 162.0 105.0 122.0 100.0 80.0
20 NaN 6 166.0 44.0 175.0 155.0 157.0 120.0 98.0 NaN
21 NaN 6 80.0 152.0 162.0 119.0 171.0 154.0 66.0 NaN
22 NaN 6 154.0 98.0 156.0 102.0 160.0 165.0 99.0 NaN
23 NaN 6 64.0 187.0 93.0 167.0 151.0 187.0 74.0 NaN
24 NaN 7 162.0 79.0 172.0 153.0 150.0 97.0 145.0 NaN
25 NaN 7 101.0 144.0 151.0 152.0 149.0 157.0 99.0 NaN
26 NaN 7 110.0 130.0 160.0 154.0 180.0 133.0 59.0 NaN
27 NaN 7 85.0 85.0 175.0 155.0 154.0 106.0 98.0 NaN
28 NaN 8 128.0 72.0 173.0 152.0 181.0 70.0 152.0 NaN
29 NaN 8 105.0 112.0 140.0 111.0 77.0 85.0 82.0 NaN
30 NaN 8 110.0 161.0 163.0 175.0 89.0 78.0 138.0 NaN
31 NaN 8 129.0 135.0 135.0 156.0 171.0 141.0 135.0 NaN
32 NaN 9 151.0 108.0 164.0 157.0 134.0 153.0 153.0 NaN
33 NaN 9 152.0 51.0 95.0 158.0 85.0 109.0 110.0 NaN
34 NaN 9 168.0 152.0 111.0 130.0 136.0 75.0 121.0 NaN
35 NaN 9 73.0 123.0 173.0 158.0 155.0 79.0 130.0 NaN
36 NaN 10 137.0 118.0 132.0 183.0 116.0 103.0 154.0 NaN
37 NaN 10 56.0 62.0 90.0 156.0 126.0 126.0 86.0 NaN
38 NaN 10 80.0 85.0 152.0 143.0 129.0 91.0 110.0 NaN
39 NaN 10 136.0 110.0 95.0 151.0 120.0 90.0 128.0 NaN
40 NaN 11 66.0 57.0 182.0 92.0 108.0 152.0 152.0 NaN
41 NaN 11 155.0 18.0 130.0 162.0 151.0 59.0 70.0 NaN
42 NaN 11 156.0 120.0 67.0 110.0 66.0 54.0 53.0 NaN
43 NaN 11 126.0 76.0 137.0 168.0 162.0 13.0 63.0 NaN
In [228]:
# Get the "Month" column
month_column = df_concatenated["Month"]

# Remove the "Month" column from the dataframe
df_concatenated = df_concatenated.drop("Month",axis=1)

# Insert the "Month" column at the first position
df_concatenated.insert(0, "Month", month_column)
# Reset the index
df_concatenated = df_concatenated.reset_index(drop=True)

# Display the updated dataframe
df_concatenated
Out[228]:
Month AQI_2015 AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
0 12 74.0 140.0 43.0 153.0 145.0 53.0 92.0 126.0 NaN
1 12 46.0 84.0 40.0 116.0 161.0 59.0 71.0 81.0 NaN
2 12 103.0 66.0 23.0 58.0 154.0 35.0 110.0 85.0 NaN
3 12 71.0 52.0 71.0 43.0 148.0 76.0 80.0 149.0 NaN
4 1 NaN 165.0 152.0 163.0 48.0 136.0 108.0 118.0 87.0
5 1 NaN 63.0 92.0 56.0 143.0 162.0 81.0 101.0 65.0
6 1 NaN 162.0 34.0 49.0 99.0 111.0 47.0 77.0 71.0
7 1 NaN 79.0 86.0 68.0 40.0 145.0 75.0 71.0 51.0
8 2 NaN 74.0 71.0 136.0 140.0 117.0 174.0 74.0 80.0
9 2 NaN 90.0 51.0 111.0 107.0 57.0 43.0 51.0 68.0
10 2 NaN 79.0 42.0 64.0 108.0 126.0 58.0 29.0 54.0
11 2 NaN 85.0 99.0 149.0 77.0 84.0 142.0 77.0 59.0
12 3 NaN 64.0 107.0 110.0 151.0 85.0 132.0 63.0 72.0
13 3 NaN 73.0 68.0 65.0 120.0 125.0 63.0 64.0 100.0
14 3 NaN 152.0 83.0 159.0 88.0 151.0 118.0 108.0 72.0
15 3 NaN 84.0 114.0 167.0 36.0 152.0 60.0 89.0 158.0
16 4 NaN 188.0 28.0 70.0 140.0 146.0 17.0 122.0 69.0
17 4 NaN 85.0 56.0 70.0 91.0 66.0 70.0 67.0 91.0
18 4 NaN 164.0 81.0 141.0 61.0 87.0 129.0 111.0 55.0
19 4 NaN 77.0 62.0 80.0 154.0 99.0 149.0 80.0 103.0
20 5 NaN 115.0 153.0 166.0 133.0 146.0 153.0 160.0 76.0
21 5 NaN 160.0 98.0 110.0 151.0 149.0 139.0 91.0 85.0
22 5 NaN 120.0 130.0 153.0 146.0 162.0 105.0 94.0 57.0
23 5 NaN 151.0 156.0 63.0 162.0 105.0 122.0 100.0 80.0
24 6 NaN 166.0 44.0 175.0 155.0 157.0 120.0 98.0 NaN
25 6 NaN 80.0 152.0 162.0 119.0 171.0 154.0 66.0 NaN
26 6 NaN 154.0 98.0 156.0 102.0 160.0 165.0 99.0 NaN
27 6 NaN 64.0 187.0 93.0 167.0 151.0 187.0 74.0 NaN
28 7 NaN 162.0 79.0 172.0 153.0 150.0 97.0 145.0 NaN
29 7 NaN 101.0 144.0 151.0 152.0 149.0 157.0 99.0 NaN
30 7 NaN 110.0 130.0 160.0 154.0 180.0 133.0 59.0 NaN
31 7 NaN 85.0 85.0 175.0 155.0 154.0 106.0 98.0 NaN
32 8 NaN 128.0 72.0 173.0 152.0 181.0 70.0 152.0 NaN
33 8 NaN 105.0 112.0 140.0 111.0 77.0 85.0 82.0 NaN
34 8 NaN 110.0 161.0 163.0 175.0 89.0 78.0 138.0 NaN
35 8 NaN 129.0 135.0 135.0 156.0 171.0 141.0 135.0 NaN
36 9 NaN 151.0 108.0 164.0 157.0 134.0 153.0 153.0 NaN
37 9 NaN 152.0 51.0 95.0 158.0 85.0 109.0 110.0 NaN
38 9 NaN 168.0 152.0 111.0 130.0 136.0 75.0 121.0 NaN
39 9 NaN 73.0 123.0 173.0 158.0 155.0 79.0 130.0 NaN
40 10 NaN 137.0 118.0 132.0 183.0 116.0 103.0 154.0 NaN
41 10 NaN 56.0 62.0 90.0 156.0 126.0 126.0 86.0 NaN
42 10 NaN 80.0 85.0 152.0 143.0 129.0 91.0 110.0 NaN
43 10 NaN 136.0 110.0 95.0 151.0 120.0 90.0 128.0 NaN
44 11 NaN 66.0 57.0 182.0 92.0 108.0 152.0 152.0 NaN
45 11 NaN 155.0 18.0 130.0 162.0 151.0 59.0 70.0 NaN
46 11 NaN 156.0 120.0 67.0 110.0 66.0 54.0 53.0 NaN
47 11 NaN 126.0 76.0 137.0 168.0 162.0 13.0 63.0 NaN
In [229]:
# Replace NaN values with 0
df_concatenated = df_concatenated.fillna(0)
df_concatenated
Out[229]:
Month AQI_2015 AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
0 12 74.0 140.0 43.0 153.0 145.0 53.0 92.0 126.0 0.0
1 12 46.0 84.0 40.0 116.0 161.0 59.0 71.0 81.0 0.0
2 12 103.0 66.0 23.0 58.0 154.0 35.0 110.0 85.0 0.0
3 12 71.0 52.0 71.0 43.0 148.0 76.0 80.0 149.0 0.0
4 1 0.0 165.0 152.0 163.0 48.0 136.0 108.0 118.0 87.0
5 1 0.0 63.0 92.0 56.0 143.0 162.0 81.0 101.0 65.0
6 1 0.0 162.0 34.0 49.0 99.0 111.0 47.0 77.0 71.0
7 1 0.0 79.0 86.0 68.0 40.0 145.0 75.0 71.0 51.0
8 2 0.0 74.0 71.0 136.0 140.0 117.0 174.0 74.0 80.0
9 2 0.0 90.0 51.0 111.0 107.0 57.0 43.0 51.0 68.0
10 2 0.0 79.0 42.0 64.0 108.0 126.0 58.0 29.0 54.0
11 2 0.0 85.0 99.0 149.0 77.0 84.0 142.0 77.0 59.0
12 3 0.0 64.0 107.0 110.0 151.0 85.0 132.0 63.0 72.0
13 3 0.0 73.0 68.0 65.0 120.0 125.0 63.0 64.0 100.0
14 3 0.0 152.0 83.0 159.0 88.0 151.0 118.0 108.0 72.0
15 3 0.0 84.0 114.0 167.0 36.0 152.0 60.0 89.0 158.0
16 4 0.0 188.0 28.0 70.0 140.0 146.0 17.0 122.0 69.0
17 4 0.0 85.0 56.0 70.0 91.0 66.0 70.0 67.0 91.0
18 4 0.0 164.0 81.0 141.0 61.0 87.0 129.0 111.0 55.0
19 4 0.0 77.0 62.0 80.0 154.0 99.0 149.0 80.0 103.0
20 5 0.0 115.0 153.0 166.0 133.0 146.0 153.0 160.0 76.0
21 5 0.0 160.0 98.0 110.0 151.0 149.0 139.0 91.0 85.0
22 5 0.0 120.0 130.0 153.0 146.0 162.0 105.0 94.0 57.0
23 5 0.0 151.0 156.0 63.0 162.0 105.0 122.0 100.0 80.0
24 6 0.0 166.0 44.0 175.0 155.0 157.0 120.0 98.0 0.0
25 6 0.0 80.0 152.0 162.0 119.0 171.0 154.0 66.0 0.0
26 6 0.0 154.0 98.0 156.0 102.0 160.0 165.0 99.0 0.0
27 6 0.0 64.0 187.0 93.0 167.0 151.0 187.0 74.0 0.0
28 7 0.0 162.0 79.0 172.0 153.0 150.0 97.0 145.0 0.0
29 7 0.0 101.0 144.0 151.0 152.0 149.0 157.0 99.0 0.0
30 7 0.0 110.0 130.0 160.0 154.0 180.0 133.0 59.0 0.0
31 7 0.0 85.0 85.0 175.0 155.0 154.0 106.0 98.0 0.0
32 8 0.0 128.0 72.0 173.0 152.0 181.0 70.0 152.0 0.0
33 8 0.0 105.0 112.0 140.0 111.0 77.0 85.0 82.0 0.0
34 8 0.0 110.0 161.0 163.0 175.0 89.0 78.0 138.0 0.0
35 8 0.0 129.0 135.0 135.0 156.0 171.0 141.0 135.0 0.0
36 9 0.0 151.0 108.0 164.0 157.0 134.0 153.0 153.0 0.0
37 9 0.0 152.0 51.0 95.0 158.0 85.0 109.0 110.0 0.0
38 9 0.0 168.0 152.0 111.0 130.0 136.0 75.0 121.0 0.0
39 9 0.0 73.0 123.0 173.0 158.0 155.0 79.0 130.0 0.0
40 10 0.0 137.0 118.0 132.0 183.0 116.0 103.0 154.0 0.0
41 10 0.0 56.0 62.0 90.0 156.0 126.0 126.0 86.0 0.0
42 10 0.0 80.0 85.0 152.0 143.0 129.0 91.0 110.0 0.0
43 10 0.0 136.0 110.0 95.0 151.0 120.0 90.0 128.0 0.0
44 11 0.0 66.0 57.0 182.0 92.0 108.0 152.0 152.0 0.0
45 11 0.0 155.0 18.0 130.0 162.0 151.0 59.0 70.0 0.0
46 11 0.0 156.0 120.0 67.0 110.0 66.0 54.0 53.0 0.0
47 11 0.0 126.0 76.0 137.0 168.0 162.0 13.0 63.0 0.0

Air quality (AQI) correlation¶

In [230]:
corr_matrix = df_concatenated.corr()
corr_matrix
Out[230]:
Month AQI_2015 AQI_2016 AQI_2017 AQI_2018 AQI_2019 AQI_2020 AQI_2021 AQI_2022 AQI_2023
Month 1.000000 0.461729 0.027517 -0.083456 0.143352 0.548952 -0.196433 -0.062703 0.355402 -0.781821
AQI_2015 0.461729 1.000000 -0.216949 -0.349050 -0.246027 0.159908 -0.539733 -0.076364 0.098740 -0.227872
AQI_2016 0.027517 -0.216949 1.000000 0.013218 0.169906 -0.004429 0.179335 -0.149311 0.284263 -0.066636
AQI_2017 -0.083456 -0.349050 0.013218 1.000000 0.287847 0.034356 0.335555 0.410284 0.197767 -0.042916
AQI_2018 0.143352 -0.246027 0.169906 0.287847 1.000000 -0.008166 0.395382 0.326390 0.386774 -0.245813
AQI_2019 0.548952 0.159908 -0.004429 0.034356 -0.008166 1.000000 0.078618 0.097288 0.222024 -0.523639
AQI_2020 -0.196433 -0.539733 0.179335 0.335555 0.395382 0.078618 1.000000 0.149633 0.141697 -0.036097
AQI_2021 -0.062703 -0.076364 -0.149311 0.410284 0.326390 0.097288 0.149633 1.000000 0.165685 -0.081594
AQI_2022 0.355402 0.098740 0.284263 0.197767 0.386774 0.222024 0.141697 0.165685 1.000000 -0.281181
AQI_2023 -0.781821 -0.227872 -0.066636 -0.042916 -0.245813 -0.523639 -0.036097 -0.081594 -0.281181 1.000000
In [231]:
#Plot the pairplot of AQI from one to other years
sns.pairplot(df_concatenated[['AQI_2016','AQI_2017','AQI_2018','AQI_2019','AQI_2020','AQI_2021','AQI_2022','AQI_2023']], kind='reg')
Out[231]:
<seaborn.axisgrid.PairGrid at 0x29f9a44c0>
In [232]:
# Generate a correlation matrix for the AQI columns
corr_matrix = df_concatenated.iloc[:, 1:].corr()

# Plot a heatmap of the correlation matrix
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm')

# Set the title of the plot
plt.title('Heatmap of AQI correlation among all years')

# Show the plot
plt.show()
In [233]:
plt.figure(figsize=(15, 6), dpi=100)
plt.grid(True)

sns.lineplot(data=df_sampled, x=df_sampled['Date_LT'], y="AQI", hue="Year", ci=0, palette='Set1')
plt.xticks(rotation=45)
plt.xlabel('Date')
plt.title('AQI Monthly for Each Year')
plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.2), ncol=3)
plt.show()
/var/folders/57/lkbzvbp92nv6k9m7r10l7d3w0000gn/T/ipykernel_23970/544724590.py:4: FutureWarning: 

The `ci` parameter is deprecated. Use `errorbar=('ci', 0)` for the same effect.

  sns.lineplot(data=df_sampled, x=df_sampled['Date_LT'], y="AQI", hue="Year", ci=0, palette='Set1')
¶